Paisley data set available here.

# Install packages (if needed)
install.packages("tidyverse")
install.packages("xlsx")
install.packages("writexl")



# Start afresh
rm(list=ls()) # Clear de "Global Environment"

# Load the packages you need
library(tidyverse) 
library(readxl)
library(writexl)



# Working directory
getwd() # Provides the current working directory
        # Helpful to see how the paths are defined (Windows/Mac differences)

setwd("/Volumes/francijb/Documents/FRAN/Teaching/QM Research School/Session") # Mac
setwd("W:/Documents/FRAN/Teaching/QM Research School/Session")                # Windows
  # RStudio menu helps finding the path: 
  #   Session/Set working directory/Choose directory
  #   Copy and paste the chosen path from the console to the script


# Import data
paisley_data <- read_excel("/Volumes/francijb/Documents/FRAN/Teaching/QM Research School/Datasets/paisley_data.xls")
paisley_data <- read_excel("W:/Documents/FRAN/Teaching/QM Research School/Datasets/paisley_data.xls")
    # if the data is in the working directory, you don't need the path (just the file name)

# Inspecting the data
paisley_data       # A peek 
    # It also indicates whether categorical (character) or numerical (double...)

view(paisley_data) # The whole dataset


#################### DESCRIPTIVE STATISTICS (1 variable) ###################


############## Categorical (qualitative) variables ############
# sex literacy employed ...

# Frequency table (tabulate)

# 1 variable: 
  # number of males and females
table(paisley_data$sex)

  # number by country of birth
table(paisley_data$countryb)

  # express it as a proportion (instead of a count)
prop.table(table(paisley_data$sex)) 

# or
paisley_data %>% 
  group_by(sex) %>%       # dimension we focus on
  summarize(n = n()) %>%  # reporting number of observations in each group
  mutate(freq = n/sum(n)) # creating a new variable with the relative frequency 

  # with literacy
paisley_data %>% 
  group_by(literacy) %>%
  summarize(n = n()) %>%
  mutate(freq = n/sum(n))

  # focusing on a subsample
paisley_data %>% 
  filter(sex == "male") %>%   # "filter" restricts the analysis to those
  group_by(literacy) %>%      #   observations fulfilling that condition
  summarize(n = n()) %>%
  mutate(freq = n/sum(n))

### Create an output that you can use later

table1 <- paisley_data %>% 
  filter(sex == "male") %>%   
  group_by(literacy) %>%      
  summarize(n = n()) %>%
  mutate(freq = n/sum(n))
view(table1)

# Export the output (object) into excel 
write_xlsx(table1, path = "table1.xlsx")



###### Plotting frequencies (graph bar)
  # use "+" to add features to the graph

# sex
ggplot(data = paisley_data) +       
  geom_bar(mapping = aes(x = sex))

  # or
paisley_data %>% 
  ggplot() + 
  geom_bar(mapping = aes(x = sex))

# literacy
paisley_data %>% 
  ggplot() + 
  geom_bar(mapping = aes(x = literacy))

# horizontal to facilitate reading the categories
paisley_data %>% 
  ggplot() + 
  geom_bar(mapping = aes(x = literacy)) +
  coord_flip()  

# proportions (instead of counts)
paisley_data %>% 
  ggplot() + 
  geom_bar(mapping = aes(x = literacy, y = stat(prop), group = 1)) +
  coord_flip()

# editing the graph
paisley_data %>% 
  ggplot() + 
  geom_bar(mapping = aes(x = literacy, y = stat(prop), group = 1)) +
  coord_flip() + ylab("Proportion") + xlab("Literacy")
  # or all together: 
  # labs(title ="", subtitle = "", x = "", etc etc)
  # many options for editing graphs... 

# save the graph
ggsave("lit.png", dpi = 320)


#### Recategorising 

# Creating a new variable and recoding new values 
# i.e. simplify literacy so there are not so many categories

paisley_data %>% 
  group_by(literacy) %>%
  summarize(n = n()) %>%
  mutate(freq = n/sum(n))

paisley_data <- mutate(paisley_data, lit_adj = recode(literacy, 
      "superior education" = "write", 
      "read well" = "read", 
      "read tolerably" = "read", 
      "read a little" = "read",
      "read & write well" = "write", 
      "read & write tolerably" = "write", 
      "read & write a little" = "read", 
      "cannot write" = "read")
       )
  # "mutate" creates a new variable based on an existing one (literacy)
  # "recode" makes the adjustments we ask for
  # note that there is no need to "illiterate" = "illiterate" 
  # (because it is already in the original variable) 
  # "paisley_data <-" tells R to alter the dataset 

  # Check the outcome (always)
View(paisley_data)   
table(paisley_data$lit_adj)
paisley_data %>% 
  ggplot() + 
  geom_bar(mapping = aes(x = lit_adj)) +
  coord_flip()
 

#### Ranking (ordering) 
# Assign order to qualitative variables
# The computer does not distinguish between categorical values
# Note that they are presented in alphabetical order
paisley_data %>% 
  group_by(literacy) %>%
  summarize(n = n()) %>%
  mutate(freq = n/sum(n))

paisley_data$lit_adj2 <- factor(paisley_data$literacy, 
                       levels = c("illiterate", "read a little", "read tolerably", "read well",
                                  "cannot write", "read & write a little", "read & write tolerably",
                                  "read & write well", "superior education"), 
                       ordered = TRUE)
    # the ranking is given by the order of categories listed in "levels" (in ascending order)
    # notice that we have created a new variable

paisley_data$lit_adj2
  # although the values have not changed, they have been assigned a ranking: see "Levels" in the last line of the results

  # checking the result
paisley_data %>% 
  ggplot() +
  geom_bar(mapping = aes(x = lit_adj2)) +
  coord_flip()

# excluding now the missing values (NA): "!is.na(lit_adj)"
paisley_data %>% 
  filter(!is.na(lit_adj2)) %>%
  ggplot() +
  geom_bar(mapping = aes(x = lit_adj2)) +
  coord_flip()


# Ranking can be also obtained by assigning numerical values directly:
paisley_data <- mutate(paisley_data, lit_adj3 = recode(literacy,   
                                                       "superior education" = 8, 
                                                       "read & write well" = 7, 
                                                       "read & write tolerably" = 6,
                                                       "read & write a little" = 5,
                                                       "cannot write" = 4,
                                                       "read well" = 3, 
                                                       "read tolerably" = 2, 
                                                       "read a little" = 1,
                                                       "illiterate" = 0) 
                                                       )
view(paisley_data)

paisley_data %>% 
  group_by(lit_adj3) %>%
  summarize(n = n()) %>%
  mutate(freq = n/sum(n))

ggplot(data = filter(paisley_data, !is.na(lit_adj3))) + # excluding the missing values (NA)
  geom_bar(mapping = aes(x = lit_adj2))


### Categorise more complicated patterns
    # i.e. the variable "marks" provides many "messy" categories; how to make sense of it
    # more on this with Gregory
paisley_data %>% 
  group_by(marks) %>%
  summarize(n = n()) %>% 
  print(n = Inf) # display all rows

install.packages("stringr")
library(stringr)
  
    # identify the prisoners where the term "scar" is mentioned in "marks"
paisley_data <- paisley_data %>% 
  mutate(scar = ifelse(str_detect(marks, "scar"), 1, 0))
      # new variable named "cut" with value of 1 when R detects the term "cut" (and 0 otherwise)
view(paisley_data)

paisley_data %>% 
  subset(select=c(marks, scar)) %>%
  print(n = 100) 

# or alternatively:
  
  # first create a new variable
paisley_data <- paisley_data %>% 
  mutate(marks_adj = marks)

  # keep replacing values
paisley_data$marks_adj[str_detect(paisley_data$marks_adj, "scar")] <- "scar"
paisley_data$marks_adj[str_detect(paisley_data$marks_adj, "cut")] <- "cut"
paisley_data$marks_adj[str_detect(paisley_data$marks_adj, "mark")] <- "mark"
paisley_data$marks_adj[str_detect(paisley_data$marks_adj, "no mark")] <- "none"
paisley_data$marks_adj[str_detect(paisley_data$marks_adj, "nomark")] <- "none"
paisley_data$marks_adj[str_detect(paisley_data$marks_adj, "blind")] <- "blind"
paisley_data$marks_adj[str_detect(paisley_data$marks_adj, "lame")] <- "lame"

paisley_data %>% 
  subset(select=c(marks, marks_adj)) %>%
  print(n = 100) 

paisley_data %>% 
  group_by(marks_adj) %>%
  summarize(n = n()) %>% 
  print(n = Inf) # display all rows

############## Numerical variables ############
# age, weight, height, ...
paisley_data

# Reporting frequencies is not useful when there are many values
paisley_data %>% 
  group_by(age) %>%
  summarize(n = n()) %>%
  mutate(freq = n/sum(n)) %>%
  print(n = Inf) # display all rows

# Solution: Create class intervals (group values into bins)
paisley_data <- paisley_data %>% 
  mutate(age_class = cut(age, breaks = 5)) # Creates groups of equal size
  
  # or setting when the breaks happen yourself
paisley_data <- paisley_data %>% 
  mutate(age_class = cut(age, breaks = c(0, 14, 19, 50, Inf)))

  # or even assign "labels" to those groups
paisley_data <- paisley_data %>% 
  mutate(age_class = cut(age, breaks = c(0, 14, 19, 50, Inf), 
                         labels = c("Children", "Youngters", "Adults", "Elderly")))

  # check
paisley_data %>% 
  group_by(age_class) %>%
  summarize(n = n()) %>%
  mutate(freq = n/sum(n)) %>%
  print(n = Inf) 


## Summarise: 
  # count: n(), mean, median, min, max, sd, IQR, min, quantile(x, 0.25)...
  # first, nth(x, 2), last
  # !is.na(x), n_distinct(x)

  # average age
summarise(paisley_data, age = mean(age, na.rm = TRUE))
        # na.rm removes missing values from the computations
        # instead of yielding missing outputs (check removing that condition)

  # or
paisley_data %>% 
  summarize(count = sum(!is.na(age)),   # "!is.na" indicates "is not na" (not available=missing value)
            mean_age = mean(age, na.rm = TRUE), # the comma allows for asking for more statistics
            min_age = min(age, na.rm = TRUE),
            max_age = max(age, na.rm = TRUE),
            p25_age = quantile(age, 0.25, na.rm = TRUE),
            p75_age = quantile(age, 0.75, na.rm = TRUE)  
  )

  ## or
summary(paisley_data$age, na.rm = T)

# Summarise by group(s): 
paisley_data %>% 
  group_by(sex) %>%
  summarize(count = sum(!is.na(age)),
            mean_age = mean(age, na.rm = TRUE),
            min_age = min(age, na.rm = TRUE),
            max_age = max(age, na.rm = TRUE)
  )

  # by several groups
paisley_data %>% 
  group_by(sex, countryb) %>%
  summarize(count = sum(!is.na(age)), 
            mean_age = mean(age, na.rm = TRUE),
            median_age = median(age, na.rm = TRUE)
  )

  # store the results in a new object
by_sex_countryb <- paisley_data %>%   
  group_by(sex, countryb) %>%
  summarize(count = sum(!is.na(age)), 
            mean_age = mean(age, na.rm = TRUE),
            median_age = median(age, na.rm = TRUE)
  )
view(by_sex_countryb)

# Export to excel (note that is appended as a differet sheet in the previous file)
write_xlsx(by_sex_countryb, path = "table2.xlsx")


## Histogram (visual representation of the distribution)
paisley_data %>%   
  ggplot() +
  geom_histogram(mapping = aes(x = age), binwidth = 5)
    # Play around with the width of the bin: accuracy vs noise

ggsave("hist_age.pdf") # save plot

# Focusing on particular subsamples
paisley_data %>%   
  filter(sex == "male") %>% 
  ggplot() +
  geom_histogram(mapping = aes(x = age), binwidth = 5)

# Or by group
paisley_data %>%   
  ggplot() +
  geom_histogram(mapping = aes(x = age), binwidth = 5) +
  facet_wrap(~ sex, nrow = 1)


## Outlier / recode value
summary(paisley_data$age, na.rm = T)
subset(paisley_data, age==160)
# or
outlier <- subset(paisley_data, age==160)
view(outlier)
# if we check the source, it was a typo, real age 16, so we correct it:
paisley_data$age[paisley_data$age == 160] <- 16 

# check the result
subset(paisley_data, forename=="FRANCIS" & surname=="GILFILLAN", select=c(forename, surname, age))

paisley_data %>%   
  ggplot() +
  geom_histogram(mapping = aes(x = age), binwidth = 5) +
  facet_wrap(~ sex, nrow = 1)

# Overlapping histograms
paisley_data %>%   
  ggplot(mapping = aes(x = age, colour = sex)) +
  geom_freqpoly(binwidth = 5)

# With relative frequency (instead of counts) so as to compare different-size groups
paisley_data %>%  
  ggplot() +
  geom_histogram(mapping = aes(x = age, y = ..density..), binwidth = 5) +
  facet_wrap(~ sex, nrow = 1)



##### Modifying / creating variables

# Inspect info on heights: feet, inches
view(paisley_data)
paisley_data %>%  
  ggplot() +
  geom_histogram(mapping = aes(x = feet), binwidth = 1)
  
  # something is going on
paisley_data %>% 
  summarize(count = sum(!is.na(feet)),
            mean_feet = mean(feet, na.rm = TRUE),
            min_feet = min(feet, na.rm = TRUE),
            max_feet = max(feet, na.rm = TRUE)
  )

subset(paisley_data, feet==50)

# correct outlier
paisley_data$feet[paisley_data$feet == 50] <- 5

  # check you dit it right
subset(paisley_data, forename=="FRANCIS" & surname=="GILFILLAN", select=c(forename, surname, feet))

  # or
paisley_data %>%  
  ggplot() +
  geom_histogram(mapping = aes(x = feet), binwidth = 1)

  # The variable "inches" looks ok
paisley_data %>%  
  ggplot() +
  geom_histogram(mapping = aes(x = inches), binwidth = 1)


# Create nex variable (height) combining both variables (feet & inches)
paisley_data <- mutate(paisley_data, 
                       height = 30.48*feet+2.54*inches)
view(paisley_data)
  # a new variable has been added (last column on the right)

  # visualise new variable
paisley_data %>%  
  ggplot() +
  geom_histogram(mapping = aes(x = height), binwidth = 5)


# Group a numerical variable into different bins
paisley_data <- paisley_data %>%
  mutate(height_bins = case_when(height < 145 ~ 'low',
                                 height >=145 & height< 175 ~ 'med',
                                 height >=175 ~ 'high'))
  # check
paisley_data %>% 
  group_by(height_bins) %>%
  summarize(n = n()) %>%
  mutate(freq = n/sum(n))


##### Save the data ####
# The data has somewhat change: correct typos, add new variables...
# We can save the new dataset as a R file, so we can come back to that later
write_rds(paisley_data, "paisley_v2.rds")
paisley_data_1 <- read_rds("paisley_v2.rds")





#### Measures of dispersion: range, iqr, variance, standard deviation ####

# Distributions differ not only in their means (medians...) but also in
# how dispersed their values are

paisley_data %>%  
  group_by(sex) %>%
  summarize(count = sum(!is.na(age)),   
            mean_age = mean(age, na.rm = TRUE), 
            min_age = min(age, na.rm = TRUE),
            max_age = max(age, na.rm = TRUE),
            range_age = max(age, na.rm = TRUE) - min(age, na.rm = TRUE),  # range: max - min
            iqr_age = IQR(age, na.rm = TRUE),                             # interquartile range: p75 - p25
            var_age = var(age, na.rm = TRUE),                             # variance
            sd_age = sd(age, na.rm = TRUE),                               # standard deviation
            cv_age = sd(age, na.rm = TRUE)/mean(age, na.rm = TRUE)        # coefficient of variation
  )

paisley_data %>%  
  filter(age >= 18) %>%
  group_by(sex) %>%
  summarize(count = sum(!is.na(height)),   
            mean = mean(height, na.rm = TRUE), 
            min = min(height, na.rm = TRUE),
            max = max(height, na.rm = TRUE),
            range = max(height, na.rm = TRUE) - min(height, na.rm = TRUE),  # range: max - min
            iqr = IQR(height, na.rm = TRUE),                             # interquartile range: p75 - p25
            var = var(height, na.rm = TRUE),                             # variance
            sd = sd(height, na.rm = TRUE),                               # standard deviation
            cv = sd(height, na.rm = TRUE)/mean(height, na.rm = TRUE)        # coefficient of variation
  )

paisley_data %>%  
  ggplot() +
ggplot(data = paisley_data, mapping = aes(x = age, colour = sex)) +
  geom_freqpoly(binwidth = 5)


# Visualising
paisley_data %>%  
  filter(age >= 18) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = height), binwidth = 2) +
  facet_wrap(~ sex, nrow = 1)


2022.